{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f2915b16",
   "metadata": {},
   "source": [
    "# MNIST example"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bba90df0",
   "metadata": {},
   "source": [
    "## Load some libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b3feaa3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import zero_one_loss\n",
    "from PIL import Image  \n",
    "import warnings\n",
    "# Suppress every warning so library notices do not clutter the cell outputs.\n",
    "warnings.filterwarnings(action=\"ignore\")\n",
    "\n",
    "# Fix the seed of NumPy's global random generator.\n",
    "SEED = 12345\n",
    "np.random.seed(SEED)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e40ea77b",
   "metadata": {},
   "source": [
    "## Read the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b28d76f3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \\\n",
      "0      1       0       0       0       0       0       0       0       0   \n",
      "1      0       0       0       0       0       0       0       0       0   \n",
      "2      1       0       0       0       0       0       0       0       0   \n",
      "3      4       0       0       0       0       0       0       0       0   \n",
      "4      0       0       0       0       0       0       0       0       0   \n",
      "\n",
      "   pixel8  ...  pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  \\\n",
      "0       0  ...         0         0         0         0         0         0   \n",
      "1       0  ...         0         0         0         0         0         0   \n",
      "2       0  ...         0         0         0         0         0         0   \n",
      "3       0  ...         0         0         0         0         0         0   \n",
      "4       0  ...         0         0         0         0         0         0   \n",
      "\n",
      "   pixel780  pixel781  pixel782  pixel783  \n",
      "0         0         0         0         0  \n",
      "1         0         0         0         0  \n",
      "2         0         0         0         0  \n",
      "3         0         0         0         0  \n",
      "4         0         0         0         0  \n",
      "\n",
      "[5 rows x 785 columns]\n"
     ]
    }
   ],
   "source": [
    "# Load the training table: the digit is in `label`, the other columns\n",
    "# (pixel0 ... pixel783) hold the pixel values of one image per row.\n",
    "data = pd.read_csv(\"train.csv\")\n",
    "\n",
    "# Select the features by name rather than by position so X and y stay\n",
    "# consistent even if the column order of the CSV changes.\n",
    "X = data.drop(columns=\"label\")\n",
    "y = data[\"label\"]\n",
    "print(data.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "95f5d6c4",
   "metadata": {},
   "source": [
    "## Show an image"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "f6cb3451",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAABwAAAAcCAAAAABXZoBIAAABHUlEQVR4nNXRL0gDcRQH8C/iOGEo4hjcglMEDbcyTCIYFrSJFqNBFxRZUwSjSa4Z1hYUsZoUBDUYhWGTCYIuXLgFBZF56fv1ZziR3Z9k86UHH97j/QH+bVi2vSZJ5qKcsOIVKZIUvdGYTTVIivSWXsh9AOj/tZV6Lkz8m1YRQQRLjUETZs52HhiLzPIgI6nz7GBTRvf5Xiy0KbI1Akw8Uu2Z6DjVQOQirIUmeVaIL+KIfN04Jp9qKSeok6T8raEUgy1J5shKs9Iu37uUP5mk3Gmg6+lqIM4ncbnD5ixwIO4lewa8ywLYSas80W0WAM6NejC8bWbYXH5mHKxWvgzi2DeAWsWaA4DuW7xr9tD7+fN6yiYoux/yXXc8hf4S317oi7jFz3QEAAAAAElFTkSuQmCC",
      "text/plain": [
       "<PIL.Image.Image image mode=L size=28x28>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reshape one flat row of pixel values into a 28x28 8-bit grid and render it.\n",
    "sample_row = X.iloc[10].values\n",
    "pixel_grid = sample_row.reshape((28, 28)).astype(np.uint8)\n",
    "img = Image.fromarray(pixel_grid)\n",
    "img"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "31307bfd",
   "metadata": {},
   "source": [
    "## Split the data into train and test sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0ca23643",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out a quarter of the rows for testing, using a fixed random_state.\n",
    "X_train, X_test, y_train, y_test = train_test_split(\n",
    "    X,\n",
    "    y,\n",
    "    test_size=0.25,\n",
    "    random_state=1179,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9a6318b",
   "metadata": {},
   "source": [
    "## Apply decision tree learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "08421baa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Decision Tree  error percentage =  15.171428571428569 %\n"
     ]
    }
   ],
   "source": [
    "# Fit a single decision tree with default settings on the training split.\n",
    "tree_clf = DecisionTreeClassifier()\n",
    "tree_clf.fit(X_train, y_train)\n",
    "\n",
    "# Score on the held-out split. zero_one_loss takes (y_true, y_pred) and\n",
    "# returns the misclassified fraction, scaled here to a percentage.\n",
    "tree_pred = tree_clf.predict(X_test)\n",
    "tree_error = zero_one_loss(y_test, tree_pred)\n",
    "print(\"Decision Tree  error percentage = \", 100*tree_error,\"%\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d1a03b40",
   "metadata": {},
   "source": [
    "## Apply random forest learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "4ded50ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Random Forest error percentage =  3.77142857142857 %\n"
     ]
    }
   ],
   "source": [
    "# Fit a random forest with default settings on the training split.\n",
    "forest_clf = RandomForestClassifier()\n",
    "forest_clf.fit(X_train, y_train)\n",
    "\n",
    "# Score on the held-out split. zero_one_loss takes (y_true, y_pred) and\n",
    "# returns the misclassified fraction, scaled here to a percentage.\n",
    "forest_pred = forest_clf.predict(X_test)\n",
    "forest_error = zero_one_loss(y_test, forest_pred)\n",
    "print(\"Random Forest error percentage = \", 100*forest_error,\"%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bc8d5dc5",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}